## Loading the dataset
# Read the IMDB_Action CSV file into a data frame and print a compact summary
# of its structure (column names, types and leading values).
# NOTE(review): the path is absolute and machine-specific; the script will
# only run as-is on a machine where this exact file exists.
IMDB_Action <- read.csv(
  file = "/Users/aerishgaba/Desktop/Projects/IMDB_Action.csv"
)
str(object = IMDB_Action)
## 'data.frame': 7800 obs. of 11 variables:
## $ Title : chr "Anora" "I'm Still Here" "Flow" "Once Upon a Time... in Hollywood" ...
## $ Title_URL : chr "https://www.imdb.com/title/tt28607951/?ref_=sr_t_1" "https://www.imdb.com/title/tt14961016/?ref_=sr_t_2" "https://www.imdb.com/title/tt4772188/?ref_=sr_t_3" "https://www.imdb.com/title/tt7131622/?ref_=sr_t_4" ...
## $ Image : chr "https://m.media-amazon.com/images/M/MV5BYThiN2M0NTItODRmNC00NDhlLWFiYTgtMWM2YTEyYzI3ZTY1XkEyXkFqcGc@._V1_QL75_U"| __truncated__ "https://m.media-amazon.com/images/M/MV5BM2FjMjBiZjgtZDkyYy00YTRlLTk5N2QtODE2ZWIyYWE0Yzg0XkEyXkFqcGc@._V1_QL75_U"| __truncated__ "https://m.media-amazon.com/images/M/MV5BOTM5ODBlOTAtYjcwZi00YzkzLWIzODEtMTM2MTZlNDFmMWU2XkEyXkFqcGc@._V1_QL75_U"| __truncated__ "https://m.media-amazon.com/images/M/MV5BMzMzNmViNjYtN2ViNi00NDM3LWFlMmItNDYyMGIzY2EzZjE2XkEyXkFqcGc@._V1_QL75_U"| __truncated__ ...
## $ ipclockupoverlay_URL: chr "https://www.imdb.com/title/tt28607951/?ref_=sr_i_1" "https://www.imdb.com/title/tt14961016/?ref_=sr_i_2" "https://www.imdb.com/title/tt4772188/?ref_=sr_i_3" "https://www.imdb.com/title/tt7131622/?ref_=sr_i_4" ...
## $ Release.Year : chr "2024" "2024" "2024" "2019" ...
## $ Duration : chr "2h 19m" "2h 17m" "1h 25m" "2h 41m" ...
## $ Rated : chr "R" "PG-13" "PG" "R" ...
## $ Rating : num 7.6 8.5 7.9 7.6 8.5 8.2 8.7 9.3 8.5 7.7 ...
## $ Votes : int 140 90 54 897 610 149 2 3 1 143 ...
## $ Popularity : int 91 85 87 84 79 85 74 82 97 94 ...
## $ Description : chr "A young escort from Brooklyn meets and impulsively marries the son of a Russian oligarch. Once the news reaches"| __truncated__ "A mother is forced to reinvent herself when her family's life is shattered by an act of arbitrary violence duri"| __truncated__ "Cat is a solitary animal, but as its home is devastated by a great flood, he finds refuge on a boat populated b"| __truncated__ "As Hollywood's Golden Age is winding down during the summer of 1969, television actor Rick Dalton and his stunt"| __truncated__ ...
# Work on `df` as a shorter handle for the loaded data, and list its column
# names.
df <- IMDB_Action
variables <- colnames(df)
variables
## [1] "Title" "Title_URL" "Image"
## [4] "ipclockupoverlay_URL" "Release.Year" "Duration"
## [7] "Rated" "Rating" "Votes"
## [10] "Popularity" "Description"
# Keep the first 15 rows of the data (fewer if the data has fewer rows) and
# display them.
top_15 <- head(x = df, n = 15L)
top_15
# Summarise missing values per column of a data frame.
#
# Args:
#   df: A data frame; it may have any number of rows or columns, including
#       zero.
#
# Returns:
#   A data frame with one row per column of `df` and three columns:
#     variable        - the column name (character)
#     missing_values  - number of NA entries in that column (numeric)
#     percentage      - that count as a percentage of nrow(df); NaN when `df`
#                       has no rows, because 0 / 0 is undefined
missing_values <- function(df) {
  n <- nrow(df)
  # Iterate over the columns themselves rather than over 1:ncol(df): that
  # sequence is c(1, 0) for a zero-column data frame and makes the lookup
  # fail. vapply() also builds the counts in one pass instead of growing the
  # result one row at a time. USE.NAMES = FALSE keeps the column names out of
  # the result's row names.
  na_counts <- vapply(
    df,
    function(column) sum(is.na(column)),
    numeric(1),
    USE.NAMES = FALSE
  )
  data.frame(
    variable = names(df),
    missing_values = na_counts,
    percentage = (na_counts / n) * 100,
    stringsAsFactors = FALSE
  )
}
# Missing values in the dataset
missing_values(df)
# Keep only the rows with more than 100 votes. The comparison is NA wherever
# Votes is NA, and indexing a data frame with a logical NA produces a row made
# entirely of NA; which() keeps just the positions where the test is TRUE.
df_filtered <- df[which(df$Votes > 100), ]
df_filtered
# Pull out the title and rating columns as standalone vectors.
independent_var <- df[["Title"]]
dependent_var <- df[["Rating"]]
# Pair the two vectors in a two-column data frame and display it.
new_df <- data.frame(
  independent_var = independent_var,
  dependent_var = dependent_var
)
new_df
# Drop the rows that have no vote count. The column is selected by name
# rather than by position, so a change in column order cannot silently make
# this test a different column; a missing "Votes" column raises an error.
df_no_na <- df[!is.na(df[, "Votes"]), ]
df_no_na
# Remove every row that is an exact duplicate of an earlier row.
df_no_duplicates <- df_no_na[!duplicated(df_no_na), ]
df_no_duplicates
# Sort by rating, highest first; order() places rows with an NA rating last.
df_ordered <- df_no_duplicates[order(-df_no_duplicates$Rating), ]
df_ordered